import os
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from math import pi
from prince import PCA
from umap.umap_ import UMAP
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Load the gym dataset; "Experiencia" is kept aside as the target label
# and the remaining columns become the feature matrix.
datos = pd.read_csv('../data/gimnasio.csv', delimiter = ';', decimal = ".")
pred = datos["Experiencia"]
datos = datos.drop(['Experiencia'], axis=1)
# One-hot encode the categorical columns (e.g. Genero, Entrenamiento).
datos = pd.get_dummies(datos)
datos.dtypes
Edad int64
Peso float64
Altura float64
Max_BPM int64
Avg_BPM int64
Rep_BPM int64
Duracion float64
Calorias int64
Grasa float64
Agua float64
Frecuencia int64
IMC float64
Genero_Femenino bool
Genero_Masculino bool
Entrenamiento_Cardio bool
Entrenamiento_Fuerza bool
Entrenamiento_HIIT bool
Entrenamiento_Yoga bool
dtype: object
# Standardize every feature to zero mean and unit variance, rebuilding
# the DataFrame so the scaled values keep the original column labels
# and row index.
escalar = StandardScaler()
datos_escalados = pd.DataFrame(
    escalar.fit_transform(datos),
    columns = datos.columns,
    index = datos.index,
)
datos_escalados
| Edad | Peso | Altura | Max_BPM | Avg_BPM | Rep_BPM | Duracion | Calorias | Grasa | Agua | Frecuencia | IMC | Genero_Femenino | Genero_Masculino | Entrenamiento_Cardio | Entrenamiento_Fuerza | Entrenamiento_HIIT | Entrenamiento_Yoga | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.422343 | 0.681493 | -0.098545 | 0.010081 | 0.922970 | -0.303555 | 1.264598 | 1.495690 | -1.978321 | 1.455967 | 0.743295 | 0.794278 | -0.950847 | 0.950847 | -0.595947 | -0.600699 | -0.542110 | 1.752464 |
| 1 | 0.600965 | 0.049316 | -1.508604 | -0.076726 | 0.504494 | 0.515749 | 0.127098 | -0.082284 | 1.426301 | -0.877898 | 0.743295 | 1.064652 | 1.051694 | -1.051694 | -0.595947 | -0.600699 | 1.844645 | -0.570625 |
| 2 | -0.548964 | -0.271491 | -0.490228 | -1.118414 | -1.518142 | -1.122858 | -0.427068 | -0.838243 | 1.346380 | -0.544488 | 0.743295 | -0.030361 | 1.051694 | -1.051694 | 1.678001 | -0.600699 | -0.542110 | -0.570625 |
| 3 | -1.123928 | -0.974433 | -0.176881 | 0.878155 | 1.411193 | -0.849757 | -1.943735 | -1.370351 | 0.611110 | -0.877898 | -0.352502 | -0.976669 | -0.950847 | 0.950847 | -0.595947 | 1.664728 | -0.542110 | -0.570625 |
| 4 | -0.056137 | -1.309393 | 0.528148 | 0.704540 | 0.992716 | 0.788850 | -1.797902 | -1.282278 | 0.675047 | 0.289035 | -0.352502 | -1.580503 | -0.950847 | 0.950847 | -0.595947 | 1.664728 | -0.542110 | -0.570625 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 968 | -1.206066 | 0.624880 | 0.136465 | 0.617733 | 0.992716 | 0.652299 | 0.914598 | 1.682845 | -2.393908 | 1.455967 | 0.743295 | 0.579482 | -0.950847 | 0.950847 | -0.595947 | 1.664728 | -0.542110 | -0.570625 |
| 969 | -1.123928 | -0.342257 | -0.881911 | 0.357311 | 1.550685 | -0.849757 | 0.360432 | 1.301196 | 0.003713 | 0.622444 | -1.448299 | 0.116842 | -0.950847 | 0.950847 | -0.595947 | 1.664728 | -0.542110 | -0.570625 |
| 970 | 1.668756 | -0.634756 | 0.293138 | 1.225384 | -1.657634 | -1.259409 | 1.352098 | 0.086523 | -0.987304 | 0.122330 | 1.839092 | -0.812942 | 1.051694 | -1.051694 | 1.678001 | -0.600699 | -0.542110 | -0.570625 |
| 971 | -0.548964 | 2.478951 | 0.841495 | 1.572614 | 0.155764 | -0.030454 | -0.456235 | -0.082284 | 0.515205 | -0.877898 | -0.352502 | 1.926843 | -0.950847 | 0.950847 | -0.595947 | -0.600699 | 1.844645 | -0.570625 |
| 972 | 0.600965 | 0.700363 | -0.725238 | -1.205221 | 0.155764 | 0.515749 | -1.477068 | -1.333653 | 0.611110 | 1.455967 | -1.448299 | 1.271938 | -0.950847 | 0.950847 | -0.595947 | 1.664728 | -0.542110 | -0.570625 |
973 rows × 18 columns
n_components: Especifica el número de componentes principales que se desean conservar.
# Fit a PCA keeping 5 principal components on the standardized data,
# then obtain each individual's (row's) coordinates in component space.
pca = PCA(n_components = 5)
pca.fit(datos_escalados)
individuos = pca.row_coordinates(datos_escalados)
individuos
PCA(n_components=5)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
| rescale_with_mean | True | |
| rescale_with_std | True | |
| n_components | 5 | |
| n_iter | 3 | |
| copy | True | |
| check_input | True | |
| random_state | None | |
| engine | 'sklearn' |
| component | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| 0 | 3.329860 | -1.249739 | -0.290032 | -1.030253 | -0.579701 |
| 1 | -1.692899 | -0.751435 | 2.037967 | 0.125233 | -2.000937 |
| 2 | -2.089927 | 0.088521 | 1.028547 | -0.720738 | 1.574962 |
| 3 | -1.199111 | 1.679009 | -1.294595 | 1.758996 | 0.426842 |
| 4 | -0.769177 | 1.584771 | -2.410124 | 1.820237 | 0.480036 |
| ... | ... | ... | ... | ... | ... |
| 968 | 3.397728 | -1.451037 | 0.101017 | 1.861335 | 0.277125 |
| 969 | 0.903832 | -0.061699 | -0.016815 | 1.998531 | 0.354720 |
| 970 | -0.188443 | -2.204169 | 0.165569 | -1.276879 | 1.425872 |
| 971 | 1.546681 | 2.159396 | 2.124486 | -0.076394 | -1.976363 |
| 972 | 0.407555 | 2.867745 | 0.048714 | 2.256457 | -0.311599 |
973 rows × 5 columns
x = individuos.iloc[:, 0]
y = individuos.iloc[:, 1]
fig = go.Figure()
for cat in pred.unique():
no_plot = fig.add_trace(
go.Scatter(x = x[pred == cat], y = y[pred == cat], mode = 'markers', name = cat)
)
fig.update_layout(
xaxis_title = "Componente 1",
yaxis_title = "Componente 2",
legend_title = "Experiencia",
legend = dict(
yanchor = "top", y = 0.99,
xanchor = "left", x = 0.01
)
)n_components: Indica la cantidad de componentes que vamos a generar.
n_neighbors: Este parámetro controla como UMAP equilibra la estructura local versus global en los datos. Lo hace restringiendo el tamaño del vecindario local. Se recomienda utilizar una cantidad de \(\frac{n}{k}\) donde \(n\) es la cantidad total de datos y \(k\) la cantidad de clusters a formar.
# 2-D UMAP embedding of the standardized data; n_neighbors follows the
# n/k guideline described in the preceding note.
umap = UMAP(n_components = 2, n_neighbors = 325)
embedding = umap.fit_transform(datos_escalados)
individuos = pd.DataFrame(embedding, index = datos_escalados.index)
individuos
| 0 | 1 | |
|---|---|---|
| 0 | 5.396263 | 6.263515 |
| 1 | 11.057268 | 3.508857 |
| 2 | 11.765102 | 5.852460 |
| 3 | 8.896211 | 8.043849 |
| 4 | 8.809156 | 8.052700 |
| ... | ... | ... |
| 968 | 4.960216 | 7.135543 |
| 969 | 8.537649 | 8.418722 |
| 970 | 10.523856 | 5.395488 |
| 971 | 6.339933 | 8.125097 |
| 972 | 7.936487 | 7.753097 |
973 rows × 2 columns
x = individuos.iloc[:, 0]
y = individuos.iloc[:, 1]
fig = go.Figure()
for cat in pred.unique():
no_plot = fig.add_trace(
go.Scatter(x = x[pred == cat], y = y[pred == cat], mode = 'markers', name = cat)
)
fig.update_layout(
xaxis_title = "Componente 1",
yaxis_title = "Componente 2",
legend_title = "Experiencia",
legend = dict(
yanchor = "top", y = 0.99,
xanchor = "left", x = 0.01
)
)n_components: cantidad de componentes a generar.
perplexity: La perplejidad está relacionada con el número de vecinos más cercanos que se utiliza en otros algoritmos de aprendizaje de variedades (manifold learning). Los conjuntos de datos más grandes suelen requerir una mayor perplejidad. Considere seleccionar un valor entre 5 y 50. Valores diferentes pueden generar resultados significativamente diferentes. La perplejidad debe ser menor que el número de muestras.
learning_rate: La tasa de aprendizaje de t-SNE suele estar en el rango [10.0, 1000.0]; es preferible dejarla en automática ('auto').
# 2-D t-SNE embedding of the standardized data (random init, automatic
# learning rate).
tsne = TSNE(n_components=2, perplexity=10, learning_rate='auto', init='random')
embedding = tsne.fit_transform(datos_escalados)
individuos = pd.DataFrame(embedding, index=datos_escalados.index)
individuos
| 0 | 1 | |
|---|---|---|
| 0 | -8.725916 | -24.905657 |
| 1 | 61.086277 | -7.725172 |
| 2 | 4.803576 | -4.994265 |
| 3 | -55.744488 | -25.657858 |
| 4 | -56.188995 | -24.579985 |
| ... | ... | ... |
| 968 | -22.261967 | -4.612391 |
| 969 | -50.129219 | -22.464746 |
| 970 | 15.969269 | -15.394198 |
| 971 | -12.121456 | 55.667549 |
| 972 | -43.028133 | -6.995152 |
973 rows × 2 columns
# Scatter plot of the two t-SNE components, one trace per "Experiencia"
# category so each class gets its own color/legend entry.
x = individuos.iloc[:, 0]
y = individuos.iloc[:, 1]
fig = go.Figure()
for cat in pred.unique():
    # Assign to a throwaway name to suppress notebook echo of the trace.
    no_plot = fig.add_trace(
        go.Scatter(x = x[pred == cat], y = y[pred == cat], mode = 'markers', name = cat)
    )
fig.update_layout(
    xaxis_title = "Componente 1",
    yaxis_title = "Componente 2",
    legend_title = "Experiencia",
    legend = dict(
        yanchor = "top", y = 0.99,
        xanchor = "left", x = 0.01
    )
)